Sieadatun Naher Kanon¶

ID: 1901018¶

Part A¶

In [13]:
import pandas as pd
import numpy as np
In [14]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
In [19]:
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
In [20]:
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import warnings
In [25]:
!pip install --upgrade seaborn matplotlib
Requirement already satisfied: seaborn in c:\users\kanon\anaconda3\lib\site-packages (0.12.2)
Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
     ------------------------------------ 294.6/294.6 kB 520.8 kB/s eta 0:00:00
Requirement already satisfied: matplotlib in c:\users\kanon\anaconda3\lib\site-packages (3.7.0)
Collecting matplotlib
  Downloading matplotlib-3.8.0-cp310-cp310-win_amd64.whl (7.6 MB)
     ----------------------------             5.4/7.6 MB 664.8 kB/s eta 0:00:04
ERROR: Wheel 'matplotlib' located at C:\Users\Kanon\AppData\Local\Temp\pip-unpack-d6bdh951\matplotlib-3.8.0-cp310-cp310-win_amd64.whl is invalid.
In [26]:
import pandas as pd
df = pd.read_csv('Mall_Customers.csv')
df = df.dropna()
df
Out[26]:
CustomerID Genre Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
... ... ... ... ... ...
195 196 Female 35 120 79
196 197 Female 45 126 28
197 198 Male 32 126 74
198 199 Male 32 137 18
199 200 Male 30 137 83

200 rows × 5 columns

In [21]:
df.describe()
Out[21]:
CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000
In [22]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
In [28]:
sns.distplot(df['Age'])
C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\3255828239.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df['Age'])
Out[28]:
<Axes: xlabel='Age', ylabel='Density'>
In [30]:
# distribution plots

plt.figure(figsize = (15, 5))
plotnumber = 1

for col in ['Annual Income (k$)', 'Spending Score (1-100)']:
    if plotnumber <= 3:
        ax = plt.subplot(1, 3, plotnumber)
        sns.distplot(df[col])
        
    plotnumber += 1
    
plt.tight_layout()
plt.show()
C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\2465352626.py:9: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df[col])
C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\2465352626.py:9: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(df[col])
In [ ]:
 
In [39]:
import pandas as pd
import matplotlib.pyplot as plt
# Sample data (replace with your actual data)
data = {
'Age': [25, 30, 35, 40],
'Count': [20, 30, 15, 10]
}
df = pd.DataFrame(data)
# Extract data for the pie chart
values = df['Count']
labels = df['Age']
explode = (0.1, 0, 0, 0) # Explode the first section for emphasis
colors = ['purple', 'pink', 'green', 'blue']
fig, ax = plt.subplots(figsize=(8, 8), dpi=100)

# Create the pie chart
patches, texts, autotexts = ax.pie(
values,
labels=labels,
autopct='%1.1f%%',
shadow=True,
startangle=90,
explode=explode,
colors=colors,
labeldistance=1.05, # Adjust label distance for separation
pctdistance=0.85, # Adjust percentage label distance
rotatelabels=True, # Rotate the labels for better readability
)
# Customize text colors
plt.setp(texts, size=12, weight='bold', color='black')
plt.setp(autotexts, size=14, weight='bold', color='white')
# Add a title
ax.set_title("Age Distribution")
# Display the pie chart
plt.show()
In [ ]:
 
In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

d = pd.read_csv("Mall_Customers.csv")
print (d)

X = d[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

sse = []

for k in range(1, 16):

    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

plt.figure(figsize=(10, 5))
plt.plot(range(1, 16), sse, marker='o', linestyle='--')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Method for Optimal K')
plt.grid(True)
plt.show()
     CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0             1    Male   19                  15                      39
1             2    Male   21                  15                      81
2             3  Female   20                  16                       6
3             4  Female   23                  16                      77
4             5  Female   31                  17                      40
..          ...     ...  ...                 ...                     ...
195         196  Female   35                 120                      79
196         197  Female   45                 126                      28
197         198    Male   32                 126                      74
198         199    Male   32                 137                      18
199         200    Male   30                 137                      83

[200 rows x 5 columns]
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
In [32]:
km = KMeans(n_clusters=5)
km.fit(X)
y = km.predict(X)
X['label'] = y
X.head()
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  warnings.warn(
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1382: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
C:\Users\Kanon\AppData\Local\Temp\ipykernel_2344\482065266.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['label'] = y
Out[32]:
Age Annual Income (k$) Spending Score (1-100) label
0 19 15 39 4
1 21 15 81 3
2 20 16 6 4
3 23 16 77 3
4 31 17 40 4
In [33]:
import plotly.express as px
import plotly.graph_objs as go

fig = px.scatter_3d(X, x="Annual Income (k$)", y="Spending Score (1-100)", z="Age",
                    color = 'label', size = 'label')
fig.show()

Part B¶

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
In [41]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
In [53]:
from scipy.cluster.hierarchy import dendrogram, linkage

plt.figure(figsize = (12, 8))

dendo = dendrogram(linkage(X, method = 'ward'))
plt.title('Dendrogram', fontsize = 15)
plt.show()
In [54]:
agc = AgglomerativeClustering(n_clusters = 5, affinity = 'euclidean', linkage = 'ward')
labels = agc.fit_predict(X)
C:\Users\Kanon\anaconda3\lib\site-packages\sklearn\cluster\_agglomerative.py:983: FutureWarning:

Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead

In [59]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming you have performed hierarchical clustering and have labels assigned to data points
# 'labels' should contain the cluster assignments for each data point

# Create a scatter plot for each cluster
for label in set(labels):
    plt.scatter(X[labels == label, 0], X[labels == label, 1],
                label=f'Cluster {label + 1}', s=100)

plt.title('Hierarchical Clustering')
plt.xlabel('Annual Income (k$)')  # Replace with your actual feature labels
plt.ylabel('Spending Score (1-100)')  # Replace with your actual feature labels
plt.legend()
plt.show()

Part C¶

In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Load the CSV data
data = pd.read_csv('Mall_Customers.csv')


# Standardize the features (if necessary)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Create a DBSCAN clustering model
dbscan = DBSCAN(eps=0.3, min_samples=5)
dbscan.fit(X)

# Get the labels assigned to each data point
labels = dbscan.labels_
# Create a scatter plot to visualize the clusters
plt.figure(figsize=(12, 8))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='winter')
plt.title('DBSCAN Clustering')
plt.xlabel('Annual Income(k$)')
plt.ylabel('Spending Score(1-100)')
plt.show()
In [ ]: